library(tidyverse)

MMETSP

Read raw data.

# Load the tab-separated MMETSP sample attribute table (one row per
# sample/attribute pair: sample_id, sample_name, attr_type, attr_value).
mmetsp_raw_data <- read_tsv(file = "../data/sample-attr.tab.txt")

── Column specification ────────────────────────────────────────────────────────────────────────────────────────────
cols(
  sample_id = col_double(),
  sample_name = col_character(),
  attr_type = col_character(),
  attr_value = col_character()
)
mmetsp_raw_data

Unpack attributes.

# Reshape the long attribute table into one row per sample, with one column
# per attribute type.
#
# Some samples carry several values for the same attribute, so every attribute
# column is collected into a list-column. Stating `values_fn = list` makes that
# choice explicit instead of relying on the fallback that emits the
# "Values are not uniquely identified" warning.
mmetsp_wider <- mmetsp_raw_data %>%
  # `sample_name` also occurs as an attribute type; rename the id column so the
  # two do not collide after widening.
  rename(sample_name_main = sample_name) %>%
  pivot_wider(
    id_cols = c('sample_id', 'sample_name_main'),
    names_from = "attr_type",
    values_from = "attr_value",
    values_fn = list,
    names_repair = "unique"
  )
Values are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = length` to identify where the duplicates arise
* Use `values_fn = {summary_fun}` to summarise duplicates
mmetsp_wider
# Collapse the `sample_name` list-column to a plain character column by keeping
# only the first sample name recorded for each sample.
#
# This is done with a vectorised extraction rather than `rowwise()`, so the
# result is an ordinary (not row-wise grouped) tibble. Samples that have no
# `sample_name` attribute get NA instead of producing a NULL value.
mmetsp_fixed <- mmetsp_wider %>%
  mutate(sample_name = map_chr(sample_name, 1, .default = NA_character_))

mmetsp_fixed
NA
# For every column, report the largest number of values stored in a single
# cell. Anything above 1 marks an attribute that is duplicated for at least
# one sample. The result is transposed so columns are listed down the page.
mmetsp_wider %>%
  summarise(
    across(
      everything(),
      function(column) max(lengths(column))
    )
  ) %>%
  t()
                                          [,1]
sample_id                                    1
sample_name_main                             1
GenBank BioSample                            1
NCBI SRA                                     1
project_id                                   1
source_mat_id                                4
sample_name                                  2
latitude                                     2
longitude                                    2
habitat_name                                 1
taxon_id                                     1
strain                                       1
genus                                        1
species                                      1
family                                       1
depth                                        1
sample_collection_site                       1
class                                        1
phylum                                       1
assembly_accession_number                    1
collection_date                              1
date_of_experiment                           1
day_portion_of_day_night_cycle_in_hours      1
envo_term_for_habitat_primary_term           1
growth_medium                                1
investigation_type                           1
light                                        1
other_collection_site_info                   1
sample_material                              1
experimental_salinity                        1
experimental_temperature                     1
fastq_file                                   1
night_portion_of_day_night_cycle_in_hours    1
primary_citation                             2
longhurst_province                           1
ph                                           1
clonal                                       1
envo_term_for_habitat_secondary_term         1
habitat_description                          1
prey_organism_if_applicable                  1
environmental_salinity                       1
other_experimental_metadata_available        2
country                                      2
axenic                                       1
modifications_to_growth_medium               1
additional_citations                         2
collection_time                              1
other_environmental_metadata_available       2
environmental_temperature                    1
phosphate                                    1
nitrate                                      1
iron                                         1
trace_elements                               1
pressure                                     1
carbon_dioxide                               1
ammonium                                     1
pcr_amp                                      1
silicate                                     1
sample_type                                  1
sample_description                           1
collection_start_time                        1
collection_stop_time                         1
site_name                                    1
site_description                             1
library_acc                                  1
sequencing_method                            1
dna_type                                     1
comments                                     1
order                                        1
superkingdom                                 1
combined_assembly_name                       1
external_sample_id                           1
habitat                                      1
principle_investigator                       1
sample_volume                                1
volume_unit                                  1
filter_min                                   1
filter_max                                   1
filter_fraction_maximum                      1
filter_fraction_minimum                      1
volume_filtered                              1
urea                                         1
chlorophyll                                  1
elevation                                    1
dissolved_oxygen                             1
particulate_organic_carbon                   1
region                                       1

Select and unnest taxon info.

# Keep the identifier, taxonomy, read-file and location columns, then expand
# their list-columns into regular columns (one row per stored value).
#
# `unnest()` requires `cols` to be stated; the columns named here are exactly
# the ones the deprecation message reports it was falling back to.
mmetsp_taxon <- mmetsp_wider %>%
  select(
    sample_id, sample_name_main, taxon_id, phylum, class, order, genus,
    species, strain, fastq_file, latitude, longitude
  ) %>%
  unnest(
    cols = c(
      taxon_id, phylum, class, order, genus, species, strain, fastq_file,
      latitude, longitude
    )
  ) %>%
  mutate(
    # Build a single label "<genus>_<species>_<strain>" with any spaces
    # replaced by underscores.
    genus_species_strain = gsub(" ", "_", paste(genus, species, strain, sep = "_"))
  )
`cols` is now required when using unnest().
Please use `cols = c(taxon_id, phylum, class, order, genus, species, strain, fastq_file, 
    latitude, longitude)`
mmetsp_taxon
# Preview only the samples that have a recorded latitude.
drop_na(mmetsp_taxon, latitude)

Select only the bare-bones columns.

# Minimal per-sample table: identifiers, taxon id, combined name label and the
# read file.
mmetsp_select <- select(
  mmetsp_taxon,
  sample_id,
  sample_name_main,
  taxon_id,
  genus_species_strain,
  fastq_file
)

GenBank

# Column names for the GenBank assembly summary table (22 columns).
# NOTE(review): `infraspecific_name2` and `isolateversion_status` look like a
# mangled `isolate, version_status` pair from the NCBI header. The column count
# still lines up, so the names are kept as-is to avoid changing the output
# CSV; confirm against README_assembly_summary.txt before relying on them.
colNames <- "assembly_accession, bioproject, biosample, wgs_master, refseq_category, taxid, species_taxid, organism_name, infraspecific_name, infraspecific_name2, isolateversion_status, assembly_level, release_type, genome_rep, seq_rel_date, asm_name, submitter, gbrs_paired_asm, paired_asm_comp, ftp_path, excluded_from_refseq, relation_to_type_material"
colNamesVec <- unlist(str_split(colNames, ", "))

genbank_path <- '../data/assembly_summary_genbank.txt'

# Count the leading "#" header lines so that only those are skipped.
# Using `comment = "#"` instead would also cut every data row at the first
# "#" that appears inside a field, leaving rows with too few columns.
genbank_head <- read_lines(genbank_path, n_max = 20)
genbank_n_header <- sum(cumprod(startsWith(genbank_head, "#")))

genbank <- read_tsv(
  genbank_path,
  skip = genbank_n_header,
  comment = "",
  # Fields are assumed to be unquoted, so a stray double quote inside a name
  # must not be treated as a quoting character (which could swallow tabs).
  quote = "",
  col_names = colNamesVec,
  # Read the taxonomy ids as text from the start: converting a parsed double
  # with as.character() can produce scientific notation (e.g. "1e+05"), which
  # would never match the character `taxon_id` used as the join key.
  col_types = cols(
    .default = col_character(),
    seq_rel_date = col_date(format = "")
  )
)

── Column specification ────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  taxid = col_double(),
  species_taxid = col_double(),
  seq_rel_date = col_date(format = "")
)
ℹ Use `spec()` for the full column specifications.

46385 parsing failures.
row col   expected     actual                                   file
  3  -- 22 columns 10 columns '../data/assembly_summary_genbank.txt'
 12  -- 22 columns 10 columns '../data/assembly_summary_genbank.txt'
 26  -- 22 columns 10 columns '../data/assembly_summary_genbank.txt'
 95  -- 22 columns 9 columns  '../data/assembly_summary_genbank.txt'
 96  -- 22 columns 9 columns  '../data/assembly_summary_genbank.txt'
... ... .......... .......... ......................................
See problems(...) for more details.
# Narrow the GenBank table to the taxonomy ids, organism name, genome
# representation and download location.
genbank_select <- select(
  genbank,
  taxid,
  species_taxid,
  organism_name,
  genome_rep,
  ftp_path
)

genbank_select

Join

# Preview the MMETSP side of the join.
mmetsp_taxon
# Ensure both taxonomy id columns are character so they can be matched against
# the character `taxon_id` column.
genbank_select <- genbank_select %>%
  mutate(across(c(taxid, species_taxid), as.character))
genbank_select
# Preview the full GenBank table.
genbank
# File-name suffix appended to the assembly directory name to form the
# genomic FASTA file name.
suffix <- "_genomic.fna.gz"

# Match MMETSP samples to GenBank assemblies by taxonomy id and build the
# download path of each matched genome.
join_taxid <- inner_join(mmetsp_taxon, genbank, by = c('taxon_id' = 'taxid')) %>%
  # Discard assemblies without a usable download location before choosing one
  # per taxon; otherwise a path such as "na/na_genomic.fna.gz" would be built.
  # NOTE(review): "na" is assumed to be the placeholder the summary file uses
  # for a missing path; confirm against the data.
  filter(!is.na(ftp_path), ftp_path != "na") %>%
  # Keep the first match per sample, then the first sample per taxon.
  distinct(sample_id, .keep_all = TRUE) %>%
  distinct(taxon_id, .keep_all = TRUE) %>%
  mutate(
    # The file name is the last "/"-separated component of the directory path
    # plus the suffix. This is vectorised, so no rowwise() grouping is needed
    # and the result is left as an ordinary tibble.
    genome_filename = paste0(str_extract(ftp_path, "[^/]*$"), suffix),
    genome_ftp_path = paste(ftp_path, genome_filename, sep = "/")
  )

join_taxid

Write

CSV of all info:

# Save the complete joined table (MMETSP sample info plus GenBank assembly
# info) as CSV.
join_taxid %>%
  write_csv('../data/mmetsp_ncbi_genome_info.csv')

FTP paths:

# Save the genome download paths, one per line.
join_taxid %>%
  pull(genome_ftp_path) %>%
  readr::write_lines('../data/genome_ftp_paths.txt')
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpgYGAKCiMgTU1FVFNQCgpSZWFkIHJhdyBkYXRhLgpgYGB7cn0KbW1ldHNwX3Jhd19kYXRhIDwtIHJlYWRfdHN2KCcuLi9kYXRhL3NhbXBsZS1hdHRyLnRhYi50eHQnKQptbWV0c3BfcmF3X2RhdGEKYGBgCgpVbnBhY2sgYXR0cmlidXRlcy4KYGBge3J9Cm1tZXRzcF93aWRlciA8LSBtbWV0c3BfcmF3X2RhdGEgJT4lCiAgcmVuYW1lKHNhbXBsZV9uYW1lX21haW4gPSBzYW1wbGVfbmFtZSkgJT4lCiAgcGl2b3Rfd2lkZXIoaWRfY29scyA9IGMoJ3NhbXBsZV9pZCcsICdzYW1wbGVfbmFtZV9tYWluJyksIG5hbWVzX2Zyb20gPSAiYXR0cl90eXBlIiwgdmFsdWVzX2Zyb20gPSAiYXR0cl92YWx1ZSIsIG5hbWVzX3JlcGFpciA9ICJ1bmlxdWUiKQoKbW1ldHNwX3dpZGVyCmBgYAoKCmBgYHtyfQptbWV0c3BfZml4ZWQgPC0gbW1ldHNwX3dpZGVyICU+JQogICMgb25seSBrZWVwIGZpcnN0IHNhbXBsZSBuYW1lCiAgcm93d2lzZSgpICU+JQogIG11dGF0ZShzYW1wbGVfbmFtZSA9IHNhbXBsZV9uYW1lW1sxXV0pCgptbWV0c3BfZml4ZWQKICAKYGBgCgpgYGB7cn0KbW1ldHNwX3dpZGVyICU+JQogIHN1bW1hcmlzZShhY3Jvc3MoZXZlcnl0aGluZygpLCB+IG1heChsZW5ndGhzKC54KSkpKSAlPiUKICB0KCkKYGBgCgpTZWxlY3QgYW5kIHVubmVzdCB0YXhvbiBpbmZvLgpgYGB7cn0KbW1ldHNwX3RheG9uIDwtIG1tZXRzcF93aWRlciAlPiUKICBzZWxlY3Qoc2FtcGxlX2lkLCBzYW1wbGVfbmFtZV9tYWluLCB0YXhvbl9pZCwgcGh5bHVtLCBjbGFzcywgb3JkZXIsIGdlbnVzLCBzcGVjaWVzLCBzdHJhaW4sIGZhc3RxX2ZpbGUsIGxhdGl0dWRlLCBsb25naXR1ZGUpICU+JQogIHVubmVzdCgpICU+JQogIG11dGF0ZSgKICAgIGdlbnVzX3NwZWNpZXNfc3RyYWluID0gZ3N1YigiICIsICJfIiwgcGFzdGUoZ2VudXMsIHNwZWNpZXMsIHN0cmFpbiwgc2VwID0gIl8iKSkKICApCm1tZXRzcF90YXhvbgpgYGAKCmBgYHtyfQptbWV0c3BfdGF4b24gJT4lCiAgZHJvcF9uYShsYXRpdHVkZSkKYGBgCgoKU2VsZWN0IG9ubHkgYmFyZWJvbmVzLgpgYGB7cn0KbW1ldHNwX3NlbGVjdCA8LSBtbWV0c3BfdGF4b24gJT4lCiAgc2VsZWN0KCdzYW1wbGVfaWQnLCAnc2FtcGxlX25hbWVfbWFpbicsICd0YXhvbl9pZCcsICdnZW51c19zcGVjaWVzX3N0cmFpbicsICdmYXN0cV9maWxlJykKYGBgCgojIEdlbmJhbmsKCmBgYHtyfQpjb2xOYW1lcyA8LSAiYXNzZW1ibHlfYWNjZXNzaW9uLCBiaW9wcm9qZWN0LCBiaW9zYW1wbGUsIHdnc19tYXN0ZXIsIHJlZnNlcV9jYXRlZ29yeSwgdGF4aWQsIHNwZWNpZXNfdGF4aWQsIG9yZ2FuaXNtX25hbWUsIGluZnJhc3BlY2lmaWNfbmFtZSwgaW5mcmFzcGVjaWZpY19uYW1lMiwgaXNvbGF0ZXZlcnNpb25fc3RhdHVzLCBhc3NlbWJseV9sZXZlbCwgcmVsZWFzZV90eXBlLCBnZW5vbWVfcmVwLCBzZXFfcmVs
X2RhdGUsIGFzbV9uYW1lLCBzdWJtaXR0ZXIsIGdicnNfcGFpcmVkX2FzbSwgcGFpcmVkX2FzbV9jb21wLCBmdHBfcGF0aCwgZXhjbHVkZWRfZnJvbV9yZWZzZXEsIHJlbGF0aW9uX3RvX3R5cGVfbWF0ZXJpYWwiCmNvbE5hbWVzVmVjIDwtIHVubGlzdChzdHJfc3BsaXQoY29sTmFtZXMsICIsICIpKQpgYGAKCmBgYHtyfQpnZW5iYW5rIDwtIHJlYWRfdHN2KCcuLi9kYXRhL2Fzc2VtYmx5X3N1bW1hcnlfZ2VuYmFuay50eHQnLAogICAgICAgICAgICAgICAgICAgIGNvbW1lbnQgPSAiIyIsCiAgICAgICAgICAgICAgICAgICAgY29sX25hbWVzID0gY29sTmFtZXNWZWMpICU+JQogIG11dGF0ZSh0YXhpZCA9IGFzLmNoYXJhY3Rlcih0YXhpZCksCiAgICAgICAgIHNwZWNpZXNfdGF4aWQgPSBhcy5jaGFyYWN0ZXIoc3BlY2llc190YXhpZCkpCmBgYAoKYGBge3J9CmdlbmJhbmtfc2VsZWN0IDwtIGdlbmJhbmsgJT4lCiAgc2VsZWN0KCd0YXhpZCcsICdzcGVjaWVzX3RheGlkJywgJ29yZ2FuaXNtX25hbWUnLCAnZ2Vub21lX3JlcCcsICdmdHBfcGF0aCcpCgpnZW5iYW5rX3NlbGVjdApgYGAKCiMgSm9pbgoKYGBge3J9Cm1tZXRzcF90YXhvbgpgYGAKCmBgYHtyfQpnZW5iYW5rX3NlbGVjdCA8LSBnZW5iYW5rX3NlbGVjdCAlPiUKICBtdXRhdGUodGF4aWQgPSBhcy5jaGFyYWN0ZXIodGF4aWQpLAogICAgICAgICBzcGVjaWVzX3RheGlkID0gYXMuY2hhcmFjdGVyKHNwZWNpZXNfdGF4aWQpKQpnZW5iYW5rX3NlbGVjdApgYGAKYGBge3J9CmdlbmJhbmsKYGBgCgpgYGB7cn0Kc3VmZml4IDwtICJfZ2Vub21pYy5mbmEuZ3oiCgpqb2luX3RheGlkIDwtIGlubmVyX2pvaW4obW1ldHNwX3RheG9uLCBnZW5iYW5rLCBieSA9IGMoJ3RheG9uX2lkJyA9ICd0YXhpZCcpKSAlPiUKICBkaXN0aW5jdChzYW1wbGVfaWQsIC5rZWVwX2FsbCA9IFRSVUUpICU+JQogIGRpc3RpbmN0KHRheG9uX2lkLCAua2VlcF9hbGwgPSBUUlVFKSAlPiUKICByb3d3aXNlKCkgJT4lCiAgbXV0YXRlKAogICAgZ2Vub21lX2ZpbGVuYW1lID0gcGFzdGUodGFpbChzdHJfc3BsaXQoZnRwX3BhdGgsICcvJylbWzFdXSwgMSksIHN1ZmZpeCwgc2VwID0gIiIpLAogICAgCiAgICBnZW5vbWVfZnRwX3BhdGggPSBwYXN0ZShmdHBfcGF0aCwgZ2Vub21lX2ZpbGVuYW1lLCBzZXAgPSAiLyIpCiAgKQogIApqb2luX3RheGlkCmBgYAoKYGBge3J9CmpvaW4KYGBgCgoKIyBXcml0ZSAKQ1NWIG9mIGFsbCBpbmZvOgpgYGB7cn0Kd3JpdGVfY3N2KGpvaW5fdGF4aWQsICcuLi9kYXRhL21tZXRzcF9uY2JpX2dlbm9tZV9pbmZvLmNzdicpCmBgYAoKRlRQIHBhdGhzOgpgYGB7cn0KcmVhZHI6OndyaXRlX2xpbmVzKGpvaW5fdGF4aWQkZ2Vub21lX2Z0cF9wYXRoLCAnLi4vZGF0YS9nZW5vbWVfZnRwX3BhdGhzLnR4dCcpCmBgYAoKCg==